/*
ULTRA-SIMPLIFIED SYNTHETIC DATA GENERATOR BASED ON REAL CUSIPS
This Stata script generates synthetic versions of:
1. BoardEx datasets (main, individual, organizational network)
2. ISS Director dataset (2007-2018)
3. ISS Director Legacy dataset (1995-2006)

All datasets are based on actual CUSIP-year combinations from cusip_year.dta
*/

clear all
set seed 12345  // Set seed for reproducibility

********************************************************************************
* LOAD CUSIP-YEAR DATASET AND CREATE FOUNDATION
********************************************************************************
// Load the CUSIP-year dataset provided by the user
use cusip_year.dta, clear

// Calculate basic stats
count
local total_cusip_years = r(N)

// Count unique CUSIPs
preserve
duplicates drop cusip, force
count
local n_unique_cusips = r(N)
restore

display as text "Total CUSIP-year combinations: `total_cusip_years'"
display as text "Number of unique CUSIPs: `n_unique_cusips'"

// Calculate average firms per year
bysort year: gen year_count = _N
sum year_count
display as text "Average firms per year: " r(mean)

// Generate company IDs
by cusip, sort: gen first = _n == 1
gen company_id = sum(first)
drop first

// Create common company info dataset
gen companyid = company_id
tempfile cusip_year_full
save `cusip_year_full'

// Create company reference dataset (without year)
preserve
bysort cusip: keep if _n == 1
drop year year_count

// Generate company names
gen companyname = "Company_" + string(companyid)

// Generate sectors
local sectors "Technology Energy Financials Healthcare Consumer Goods Services Utilities Manufacturing Materials Telecommunications"
gen sector = ""
forvalues i = 1/10 {
    local sector`i' : word `i' of `sectors'
    replace sector = "`sector`i''" if floor(10 * runiform()) + 1 == `i'
}

// Generate organization types
gen orgtype = "PUBLIC" 

// Generate ISIN (International Securities Identification Number)
gen isin = "US" + cusip

// Company location - all USA
gen hocountryname = "United States of America"

// Save as company reference dataset
save synthetic_company_reference.dta, replace
tempfile company_reference
save `company_reference'
restore

// Merge company info back to CUSIP-year panel
merge m:1 cusip using `company_reference', keepusing(companyname sector orgtype isin hocountryname) nogen

// For each company-year, determine target board size (around 8 on average)
gen target_board_size = floor(runiform() * 5) + 6  // 6-10 directors
egen total_board_seats = sum(target_board_size)
display as text "Total board seats needed: " total_board_seats

// Save company-year panel
save synthetic_company_year_panel.dta, replace
tempfile company_year_panel
save `company_year_panel'

********************************************************************************
* CREATE SYNTHETIC DIRECTORS
********************************************************************************
// Calculate roughly how many directors we need (allowing for overlap)
local total_directors = round(total_board_seats[1] * 0.7)  // 70% of total seats, allowing for overlap
display as text "Creating `total_directors' synthetic directors"

// Generate directors
clear
set obs `total_directors'

// Generate director IDs
gen directorid = _n

// Name components
gen title = "Mr." if mod(directorid, 4) != 0
replace title = "Ms." if title == ""

gen forename1 = "Director" + string(directorid)
gen forename2 = ""
gen forename3 = ""
gen forename4 = ""
gen surname = "Board" + string(floor(directorid/10))
gen usualname = forename1
gen directorname = title + " " + forename1 + " " + surname
gen suffixtitle = ""

// Demographics
gen gender = "Male" if title == "Mr."
replace gender = "Female" if title == "Ms."
gen nationality = "American"
gen dob = mdy(1, 1, 1960 + mod(directorid, 30))
format dob %td
gen dod = .
format dod %td
gen age = 2020 - (1960 + mod(directorid, 30))
gen networksize = floor(runiform() * 10) + 5
gen directorvisible = 1

// Flag directors who will serve on multiple boards (20%)
gen high_overlap = mod(directorid, 5) == 0

// For ISS datasets
gen director_detail_id = directorid

// Save director reference dataset
save synthetic_director_reference.dta, replace

// Save the boardex_indiv dataset
drop high_overlap director_detail_id
save synthetic_boardex_indiv.dta, replace
export delimited using "synthetic_boardex_indiv.csv", replace
display as text "Created synthetic Boardex individual directors dataset"

********************************************************************************
* BOARDEX DATASET (MAIN BOARD ROLES)
********************************************************************************
// Create board assignments
use `company_year_panel', clear
expand target_board_size
gen board_member_seq = _n

// Assign directors - high overlap directors get ~5 boards each, others get 1-2
gen random_val = runiform()
sort random_val
gen directorid = floor((_n - 1) / 5) + 1
replace directorid = mod(directorid, `total_directors') + 1  // Ensure we stay within range

// Get director information
merge m:1 directorid using synthetic_director_reference.dta, keepusing(directorname gender) keep(1 3) nogen

// Generate role information
local positions "Board Member Executive Committee Member Audit Committee Member Compensation Committee Chairman"
local rolenames "Chairman CEO President CFO Executive Director Non-Executive Director Independent Director"

gen brdposition = ""
gen rolename = ""

forvalues i = 1/5 {
    local position : word `i' of `positions'
    replace brdposition = "`position'" if mod(_n, 5) == `i' - 1
}

forvalues i = 1/7 {
    local role : word `i' of `rolenames'
    replace rolename = "`role'" if mod(_n, 7) == `i' - 1
}

// Non-executive director flag
gen ned = 0
replace ned = 1 if strpos(rolename, "Non-Executive") > 0 | strpos(rolename, "Independent") > 0

// Full text description
gen fulltextdescription = rolename + " and " + brdposition + " at " + companyname

// Role start and end dates
gen datestartrole = mdy(1, 1, year)
gen dateendrole = mdy(12, 31, year + floor(runiform() * 3))
format datestartrole %td
format dateendrole %td

// Calculate board size statistics
bysort companyid year: gen board_size = _N
sum board_size
display as text "Average board size: " r(mean)

// Calculate director overlap statistics
bysort directorid: gen board_count = _N
sum board_count
display as text "Average directorships per director: " r(mean)
display as text "Max boards per director: " r(max)

// Keep only the variables needed
keep companyid directorid year companyname directorname brdposition rolename fulltextdescription ned datestartrole dateendrole hocountryname sector orgtype isin cusip

// Save main boardex dataset
save synthetic_boardex.dta, replace
export delimited using "synthetic_boardex.csv", replace
display as text "Created synthetic Boardex main dataset"

********************************************************************************
* BOARDEX_OTHER DATASET (COMPANY AND ORGANIZATIONAL NETWORK)
********************************************************************************
// Since we need a much simpler approach, let's completely rebuild this dataset
clear
use synthetic_boardex.dta, clear

// Count directors with multiple board seats
bysort directorid: gen board_count = _N
count if board_count > 1
display as text "Directors with multiple seats: " r(N)

// Only keep directors with multiple board seats
keep if board_count > 1
display as text "Creating organizational network dataset..."

// Create a network dataset with all the required variables
gen AssociationType = "Board Overlap"
gen BoardName = companyname + " Board"
gen CompanyName = companyname
gen DirectorName = directorname
gen OverlapYearStart = year(datestartrole)
gen OverlapYearEnd = year(dateendrole)
gen Role = rolename
gen AssociatedRole = rolename
gen ConnCompanyOrgType = orgtype
gen BoardID = companyid
gen ConnCompanyName = "Connected_" + companyname
gen CompanyID = companyid + 1000
gen DirectorID = directorid
gen StartCompanyDateStartRole = datestartrole
gen StartCompanyDateEndRole = dateendrole
gen ConnCompanyDateStartRole = datestartrole + 30  // Offset by 30 days
gen ConnCompanyDateEndRole = dateendrole + 30     // Offset by 30 days
gen OrgType = orgtype

// Keep only the required variables and order them correctly
keep AssociationType BoardName CompanyName DirectorName OverlapYearStart OverlapYearEnd Role AssociatedRole ConnCompanyOrgType BoardID CompanyID DirectorID StartCompanyDateStartRole StartCompanyDateEndRole ConnCompanyDateStartRole ConnCompanyDateEndRole OrgType ConnCompanyName

// Make sure we have reasonable number of connections
count
if r(N) > 10000 {
    // Keep a random subset if we have too many
    gen random_val = runiform()
    sort random_val
    keep in 1/10000
    drop random_val
}

// Save the boardex_other dataset
drop ConnCompanyName  // This isn't in the requested schema
save synthetic_boardex_other.dta, replace
export delimited using "synthetic_boardex_other.csv", replace
display as text "Created synthetic Boardex organizational network dataset"

********************************************************************************
* ISS DIRECTOR DATASET (2007-2018)
********************************************************************************
// For ISS datasets, we'll create them from the main BoardEx dataset
use synthetic_boardex.dta, clear

// Filter to relevant years
keep if year >= 2007 & year <= 2018

// Keep only essential variables
keep cusip year companyid directorid

// Rename to match ISS schema
rename companyid company_id

// Save ISS Director dataset
save synthetic_iss_director.dta, replace
export delimited using "synthetic_iss_director.csv", replace
display as text "Created synthetic ISS Director dataset (2007-2018)"

********************************************************************************
* ISS DIRECTOR LEGACY DATASET (1995-2006)
********************************************************************************
// Similar process for legacy years
use synthetic_boardex.dta, clear

// Filter to relevant years
keep if year >= 1995 & year <= 2006

// Keep only essential variables
keep cusip year companyid directorid

// Rename to match ISS schema
rename companyid company_id

// Save ISS Director Legacy dataset
save synthetic_iss_director_legacy.dta, replace
export delimited using "synthetic_iss_director_legacy.csv", replace
display as text "Created synthetic ISS Director Legacy dataset (1995-2006)"

********************************************************************************
* CREATE A README FILE
********************************************************************************
file open readme using "synthetic_data_README.txt", write replace
file write readme "COMPREHENSIVE SYNTHETIC DATA BASED ON REAL CUSIPS" _n _n
file write readme "This package contains synthetic financial datasets based on real CUSIP-year" _n
file write readme "combinations from your cusip_year.dta file." _n _n
file write readme "DATASETS INCLUDED:" _n _n
file write readme "1. synthetic_boardex.csv - Board roles dataset" _n
file write readme "2. synthetic_boardex_indiv.csv - Director information" _n
file write readme "3. synthetic_boardex_other.csv - Organizational network" _n
file write readme "4. synthetic_iss_director.csv - ISS data for 2007-2018" _n
file write readme "5. synthetic_iss_director_legacy.csv - ISS data for 1995-2006" _n _n
file write readme "All datasets are based on your real CUSIPs with synthetic information added." _n
file close readme

display as text "COMPLETE: All synthetic datasets have been generated"